library(tidyverse) ; library(reshape2) ; library(glue) ; library(plotly)
library(RColorBrewer) ; library(viridis) ; require(gridExtra) ; library(GGally)
library(mgcv) # gams
library(FNN) # Nearest neighbours

Load preprocessed data (Preprocessing/Gandal/AllRegions/20_02_27_data_preprocessing.html)

# Gandal dataset
load('./../../Gandal/AllRegions/Data/preprocessed_data.RData')
datExpr = datExpr %>% data.frame
DE_info = DE_info %>% data.frame

# GO Neuronal annotations
GO_annotations = read.csv('./../../Gandal/AllRegions/Data/genes_GO_annotations.csv')
GO_neuronal = GO_annotations %>% filter(grepl('neuron', go_term)) %>% 
              mutate('ID'=as.character(ensembl_gene_id)) %>% 
              dplyr::select(-ensembl_gene_id) %>% distinct(ID) %>%
              mutate('Neuronal'=1)

# SFARI Genes
SFARI_genes = read_csv('./../Data/SFARI_genes_08-29-2019_with_ensembl_IDs.csv')
SFARI_genes = SFARI_genes[!duplicated(SFARI_genes$ID) & !is.na(SFARI_genes$ID),]


# Update DE_info with SFARI and Neuronal information
genes_info = DE_info %>% dplyr::mutate('ID'=rownames(.)) %>% left_join(SFARI_genes, by='ID') %>% 
  dplyr::mutate(`gene-score`=ifelse(is.na(`gene-score`), 'Others', `gene-score`)) %>%
  left_join(GO_neuronal, by='ID') %>% dplyr::mutate(Neuronal=ifelse(is.na(Neuronal), 0, Neuronal)) %>%
  dplyr::mutate(gene.score=ifelse(`gene-score`=='Others' & Neuronal==1, 'Neuronal', `gene-score`), 
                significant=padj<0.05 & !is.na(padj))

SFARI_colour_hue = function(r) {
  pal = c('#FF7631','#FFB100','#E8E328','#8CC83F','#62CCA6','#59B9C9','#b3b3b3','#808080','gray','#d9d9d9')[r]
}

rm(dds, DE_info, GO_annotations, GO_neuronal, datGenes)

Fold-Change Direction


The proportion of over- and under-expressed genes in each SFARI Gene score is not very different to the proportion in the genes iwth Neuronal annotations nor in the rest of the genes (good, something less to worry about)

aux = genes_info %>% dplyr::select(ID, log2FoldChange, gene.score) %>%
      left_join(data.frame('ID' = rownames(datExpr), 'meanExpr' = rowMeans(datExpr)), by = 'ID') %>%
      dplyr::mutate(direction = ifelse(log2FoldChange>0, 'over-expressed', 'under-expressed'))

plot_data = aux %>% group_by(gene.score, direction) %>% tally(name = 'p') %>% left_join(aux %>% group_by(gene.score) %>% tally, by = 'gene.score') %>%
            mutate(p = p/n, y=1)


plot_data %>% ggplot(aes(gene.score, p, fill=direction)) + geom_bar(stat='identity') + 
              geom_hline(yintercept = mean(plot_data$p[plot_data$direction=='under-expressed']), linetype = 'dashed', color = 'white') + 
              ylab('Percentage') + xlab('SFARI Gene Scores') + ggtitle('Direction of Fold-Change in genes by SFARI Score') + theme_minimal()

rm(aux)

Fold-Change Magnitude


There is a negative relation between the magnitude of the LFC and the SFARI Gene scores. This would suggest that DEA is not a good approach to identify new SFARI Genes

ggplotly(genes_info %>% ggplot(aes(x=gene.score, y=abs(log2FoldChange), fill=gene.score)) + 
         geom_boxplot() + scale_fill_manual(values=SFARI_colour_hue(r=c(1:6,8,7))) + 
         ylab('logFoldChange Magnitude') + xlab('SFARI Gene Score') + theme_minimal() + theme(legend.position='none'))

We know that in general there is a negative relation between mean expression and lFC in genes, and we also know that there is a strong relation between SFARI Gene Scores and the mean level of expression of the genes.

This could explain the behaviour we found above, but we want to see if, once you control for the level of expression, the SFARI genes continue to have this relation to LFC or if it dissapears. (Being optimistic, perhaps the SFARI genes actually have higher LFC than genes with similar levels of expression, but we can’t see this in the original plot because of the relation between level of expression and LFC)

plot_data = genes_info %>% dplyr::select(ID, log2FoldChange, gene.score, significant) %>%
            left_join(data.frame('ID' = rownames(datExpr), 'meanExpr' = rowMeans(datExpr)), by = 'ID') %>%
            mutate(alpha = ifelse(gene.score == 'Others' , 0.1, ifelse(gene.score == 'Neuronal', 0.3, 0.7)))

p1 = plot_data %>% ggplot(aes(gene.score, meanExpr, fill=gene.score)) + geom_boxplot() +
     scale_fill_manual(values=SFARI_colour_hue(r=c(1:6,8,7))) + theme_minimal() +
     ylab('Mean Level of Expression') + xlab('SFARI Gene Score') + theme(legend.position='none')

p2 = plot_data %>% ggplot(aes(meanExpr, abs(log2FoldChange), color = gene.score)) + geom_point(alpha=plot_data$alpha) + geom_smooth(method='lm', color='#999999') + 
     ylab('LogFoldChange Magnitude') + xlab('Mean Expression') + scale_color_manual(values=SFARI_colour_hue(r=c(1:6,8,7))) + 
     theme_minimal() + theme(legend.position = 'none')

p2 = ggExtra::ggMarginal(p2, type='density',groupColour = TRUE, size=10)

grid.arrange(p2, p1, ncol=2, widths = c(0.6, 0.4))

rm(p1,p2)

Fold-Change Magnitude controlling by level of expression


We want to know what happens to the originally negative relation we found between SFARI Gene scores and lFC magnitude when we control for level of expression.

To do this, I’m going to compare each SFARI Gene with its closest non-SFARI neighbours following these steps:

  1. Select one SFARI gene

  2. Select its neighbours: 100 non-SFARI genes with the most similar mean level of Expression

  3. Standardise the lFC magnitude of each of the neighbours and of the SFARI gene (using the mean and sd of the lFC magnitude of only these 101 genes)

  4. Repeat this for each of the SFARI Genes, saving the standardised lFC magnitudes of all the SFARI genes and all the neighbours

  5. Compare the distribution of this value between these two groups (SFARI and their neighbours)


This plot shows the general idea of steps 1, 2, and 3, selecting a random SFARI gene:

n = 100

plot_data = genes_info %>% dplyr::select(ID, log2FoldChange, gene.score) %>%
            left_join(data.frame('ID' = rownames(datExpr), 'meanExpr' = rowMeans(datExpr)), by = 'ID')

SFARI_gene = plot_data %>% filter(gene.score %in% c('1','2','3','4','5','6')) %>% sample_n(1) %>% mutate(d=0, alpha = 1)
nn = plot_data %>% filter(gene.score %in% c('Neuronal','Others')) %>% mutate(d = abs(meanExpr-SFARI_gene$meanExpr), alpha=0.5) %>% top_n(n=-n, wt = d)

plot_data = rbind(SFARI_gene, nn) %>% mutate(std_magnitude = (abs(log2FoldChange) - mean(abs(log2FoldChange)))/sd(abs(log2FoldChange)))

p1 = plot_data %>% ggplot(aes(meanExpr, abs(log2FoldChange), color = gene.score)) + geom_point(alpha = plot_data$alpha) + 
     scale_color_manual(values=SFARI_colour_hue(r=c(as.numeric(SFARI_gene$gene.score),8,7))) + 
     xlab('Mean Expression') + ylab('Log2 Fold Change Magnitude') + theme_minimal() + theme(legend.position='none')

p2 = plot_data %>% ggplot(aes(meanExpr, std_magnitude, color = gene.score)) + geom_point(alpha = plot_data$alpha) + 
     geom_hline(aes(yintercept = mean(std_magnitude)), linetype = 'dashed', color = '#999999') + 
     scale_color_manual(values=SFARI_colour_hue(r=c(as.numeric(SFARI_gene$gene.score),8,7))) + 
     geom_segment(aes(x=SFARI_gene$meanExpr, y=mean(std_magnitude), xend = SFARI_gene$meanExpr, yend = std_magnitude[1]),
                  alpha = 0.5, color = SFARI_colour_hue(r=1:8)[as.numeric(SFARI_gene$gene.score)]) +
     xlab('Mean Expression') + ylab('Standardised LFC Magnitude') + theme_minimal() + theme(legend.position='none')
for(i in 1:15){
random_sample = plot_data %>% filter(gene.score != SFARI_gene$gene.score) %>% sample_n(1)
p2 = p2 + geom_segment(x=random_sample$meanExpr, xend = random_sample$meanExpr, y=mean(plot_data$std_magnitude), yend = random_sample$std_magnitude,
                  alpha = 0.5, color = 'gray')  
}

grid.arrange(p1, p2, ncol=2, top='Comparing SFARI Genes with their n closest neighbours by Mean Expression')

cat(paste0('SFARI gene\'s standardised distance to its neigbours\'s LFC magnitude: ', round(plot_data$std_magnitude[1],4)))
## SFARI gene's standardised distance to its neigbours's LFC magnitude: 1.4202
rm(p1, p2, SFARI_gene, nn, random_sample, i)

As steps 4, and 5, say, we repeat this for all of the SFARI Genes, recording their standardised mangnitude as well as the ones from their neighbours so we can study them all together


Results


Even when controlling for the relation between Mean Expression and LFC by comparing each SFARI Gene only with neighbouring genes, we see the same results as before!

  • Neuronal genes have higher magnitudes of lFC than non-SFARI, non-neuronal genes consistently (makes sense)

  • The higher the SFARI Gene score, the lower the LFC (even with genes with the same level of expression!)

  • Only SFARI Genes with score 6 have LFC magnitudes higher than their neighbours

get_std_lfc_magnitudes = function(data_info, SFARI_score, n){
  
  SFARI_genes = data_info %>% filter(gene.score == as.character(SFARI_score))
  
  std_magnitudes = data.frame(gene.score = as.character(), std_magnitude = as.numeric)
  
  for(i in 1:nrow(SFARI_genes)){
    SFARI_gene = SFARI_genes[i,]
    nn = data_info %>% filter(gene.score %in% c('Neuronal','Others')) %>% mutate(d = abs(meanExpr-SFARI_gene$meanExpr)) %>% top_n(n=-n, wt = d) %>% dplyr::select(-d)
    iter_data = rbind(SFARI_gene, nn) %>% mutate(std_magnitude = (abs(log2FoldChange) - mean(abs(log2FoldChange)))/sd(abs(log2FoldChange))) %>%
                 dplyr::select(gene.score, std_magnitude)
    std_magnitudes = rbind(std_magnitudes, iter_data)
  }
  
  return(std_magnitudes)
}

data_info = genes_info %>% dplyr::select(ID, log2FoldChange, gene.score) %>% 
            left_join(data.frame('ID' = rownames(datExpr), 'meanExpr' = rowMeans(datExpr)), by = 'ID')

std_magnitudes_score_1 = get_std_lfc_magnitudes(data_info, 1, n)
std_magnitudes_score_2 = get_std_lfc_magnitudes(data_info, 2, n)
std_magnitudes_score_3 = get_std_lfc_magnitudes(data_info, 3, n)
std_magnitudes_score_4 = get_std_lfc_magnitudes(data_info, 4, n)
std_magnitudes_score_5 = get_std_lfc_magnitudes(data_info, 5, n)
std_magnitudes_score_6 = get_std_lfc_magnitudes(data_info, 6, n)

p1 = std_magnitudes_score_1 %>% ggplot(aes(gene.score, std_magnitude, fill = gene.score)) + geom_boxplot() + xlab('') + ylab('Standardised LFC Magnitude') +
                                scale_fill_manual(values=SFARI_colour_hue(r=c(1,8,7))) + theme_minimal() + theme(legend.position = 'none')
p2 = std_magnitudes_score_2 %>% ggplot(aes(gene.score, std_magnitude, fill = gene.score)) + geom_boxplot() + xlab('') + ylab('') +
                                scale_fill_manual(values=SFARI_colour_hue(r=c(2,8,7))) + theme_minimal() + theme(legend.position = 'none')
p3 = std_magnitudes_score_3 %>% ggplot(aes(gene.score, std_magnitude, fill = gene.score)) + geom_boxplot() + xlab('') + ylab('') +
                                scale_fill_manual(values=SFARI_colour_hue(r=c(3,8,7))) + theme_minimal() + theme(legend.position = 'none')
p4 = std_magnitudes_score_4 %>% ggplot(aes(gene.score, std_magnitude, fill = gene.score)) + geom_boxplot() + xlab('') + ylab('Standardised LFC Magnitude') +
                                scale_fill_manual(values=SFARI_colour_hue(r=c(4,8,7))) + theme_minimal() + theme(legend.position = 'none')
p5 = std_magnitudes_score_5 %>% ggplot(aes(gene.score, std_magnitude, fill = gene.score)) + geom_boxplot() + xlab('') + ylab('') +
                                scale_fill_manual(values=SFARI_colour_hue(r=c(5,8,7))) + theme_minimal() + theme(legend.position = 'none')
p6 = std_magnitudes_score_6 %>% ggplot(aes(gene.score, std_magnitude, fill = gene.score)) + geom_boxplot() + xlab('') + ylab('') +
                                scale_fill_manual(values=SFARI_colour_hue(r=c(6,8,7))) + theme_minimal() + theme(legend.position = 'none')

grid.arrange(p1,p2,p3,p4,p5,p6, nrow=2, top = 'Comparison of LFC Magnitude of SFARI gens and their closest neighbours by Mean Expression')

rm(p1,p2,p3,p4,p5,p6, std_magnitudes_score_1, std_magnitudes_score_2, std_magnitudes_score_3, std_magnitudes_score_4, std_magnitudes_score_5, std_magnitudes_score_6)